{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import re\n",
    "from pprint import pprint\n",
    "clinical_df = pd.read_excel(\"20200414.xlsx\", sheet_name=0)\n",
    "library_df = pd.read_excel(\"20200414.xlsx\", sheet_name=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "total sampleId in clinical_df: 2357; unique sampleId(count=1) in clinical_df: 2065\n"
    }
   ],
   "source": [
    "sampleId_count_clinical = clinical_df[\"样本编号\"].value_counts()\n",
    "sampleId_unique_clinical = sampleId_count_clinical.index[sampleId_count_clinical==1]\n",
    "print(\"total sampleId in clinical_df: {}; unique sampleId(count=1) in clinical_df: {}\".format(clinical_df.shape[0], sampleId_unique_clinical.shape[0]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "total sampleId in library_df: 2197; unique sampleId(count=1) in library_df: 1408\n"
    }
   ],
   "source": [
    "library_df.loc[:, \"虚拟入库样本名\"] = library_df.loc[:, \"虚拟入库样本名\"].fillna(method='ffill')\n",
    "library_df.loc[:, \"pooling比例\"] = library_df.loc[:, \"pooling比例\"].fillna(method='ffill')\t\n",
    "library_df.loc[:, \"取样ng\"] = library_df.loc[:, \"取样ng\"].fillna(method='ffill')\t\n",
    "library_df.loc[:, \"体积ul\"] = library_df.loc[:, \"体积ul\"].fillna(method='ffill')\t\n",
    "library_df.loc[:, \"杂交探针\"] = library_df.loc[:, \"杂交探针\"].fillna(method='ffill')\t\n",
    "library_df.loc[:, \"杂交时间h\"] = library_df.loc[:, \"杂交时间h\"].fillna(method='ffill')\t\n",
    "library_df.loc[:, \"PostPCR循环数\"] = library_df.loc[:, \"PostPCR循环数\"].fillna(method='ffill')\t\n",
    "library_df.loc[:, \"PostPCR浓度ng/ul\"] = library_df.loc[:, \"PostPCR浓度ng/ul\"].fillna(method='ffill')\t\n",
    "library_df.loc[:, \"洗脱体积\"] = library_df.loc[:, \"洗脱体积\"].fillna(method='ffill')\t\n",
    "library_df.loc[:, \"文库质控\"] = library_df.loc[:, \"文库质控\"].fillna(method='ffill')\t\n",
    "library_df.loc[:, \"文库名称\"] = library_df.loc[:, \"文库名称\"].fillna(method='ffill')\n",
    "library_df = library_df.fillna('无')\n",
    "library_df[\"华大样本编号P\"] = library_df[\"华大样本编号\"].str.slice_replace(2, 3, repl='P')\n",
    "tmp = [ str(int(x)) if x!=\"无\" else x for x in library_df.loc[:,\"建库日期\"].to_list()]\n",
    "library_df[\"建库日期\"] = [ x[:4]+\"/\"+x[4:6]+\"/\"+x[6:] if x!=\"无\" else \"\" for x in  tmp]\n",
    "sampleId_count_libaray = library_df[\"华大样本编号P\"].value_counts()\n",
    "sampleId_unique_libaray = sampleId_count_libaray.index[sampleId_count_libaray==1]\n",
    "print(\"total sampleId in library_df: {}; unique sampleId(count=1) in library_df: {}\".format(library_df.shape[0], sampleId_unique_libaray.shape[0]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "overlap unique sampleId(count=1) in clinical_df and library_df: 1105\n"
    }
   ],
   "source": [
    "sampleId_unique_overlap = sampleId_unique_libaray[sampleId_unique_libaray.isin(sampleId_unique_clinical)]\n",
    "print(\"overlap unique sampleId(count=1) in clinical_df and library_df: {}\".format(sampleId_unique_overlap.shape[0]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "library_df_filt = library_df.loc[library_df[\"华大样本编号P\"].isin(sampleId_unique_overlap)]\n",
    "library_df_filt[\"建库编号\"] = library_df[\"华大样本编号\"].str.replace(re.compile(r'$'), '_1_m1')\n",
    "library_df_filt[\"DNA提取编号\"] = library_df[\"华大样本编号\"].str.replace(re.compile(r'$'), '_1')\n",
    "count = {}\n",
    "libraryId_list = []\n",
    "for dnaId in library_df_filt.loc[:, \"DNA提取编号\"]:\n",
    "    dnaId = str(dnaId)\n",
    "    count[dnaId] = count[dnaId]+1 if dnaId in count else 1\n",
    "    libraryId_list.append(\"{}_m{}\".format(dnaId, count[dnaId]))\n",
    "\n",
    "\n",
    "clinical_df_filt = clinical_df.loc[clinical_df[\"样本编号\"].isin(sampleId_unique_overlap)]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "library_df_filt.loc[:, \"文库名称\"] = library_df_filt.loc[:, \"文库名称\"].str.strip()\n",
    "library_df_filt.loc[:, \"捕获文库操作人\"] = library_df_filt.loc[:, \"文库名称\"].str.extract(r'（(.+)）', flags=re.IGNORECASE, expand=True).iloc[:,0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "captureId = [ str(x) for x in library_df_filt.loc[:, \"文库名称\"]]\n",
    "# print(\"before:{}\".format(captureId[:30]))\n",
    "# print(\"before:{}\".format(captureId[-30:]))\n",
    "# print(\">>>>>>>>>\")\n",
    "for idx in range(len(captureId)):\n",
    "    captureId[idx] = re.sub(r'（\\w+）', \"\", captureId[idx])\n",
    "    captureId[idx] = re.sub(r'-cpg.*?-', r'-CpGiant-', captureId[idx], flags=re.IGNORECASE)\n",
    "    captureId[idx] = re.sub(r'-(\\d{6})$', r'-20\\1', str(captureId[idx]))\n",
    "    captureId[idx] = re.sub(r'-(\\d{6})-', r'-20\\1-', str(captureId[idx]))\n",
    "    captureId[idx] = re.sub(r'ssM-(\\d+)-CpGiant-(\\d{8})', r'ssM-CpGiant-\\1-\\2', captureId[idx])\n",
    "    captureId[idx] = re.sub(r'ssM-CpGiant-(\\d{8})-(\\d+)', r'ssM-CpGiant-\\2-\\1', captureId[idx])\n",
    "# print(\"after:{}\".format(captureId[:30]))\n",
    "# print(\"after:{}\".format(captureId[-30:]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "tags": [
     "outputPrepend"
    ]
   },
   "outputs": [],
   "source": [
    "year_pre = int(re.match(r'^.+-(\\d{4})\\d{4}$', captureId[0]).group(1))\n",
    "for idx in range(len(captureId)):\n",
    "    try:\n",
    "        year = int(re.match(r'^.+-(\\d{4})\\d{4}$', captureId[idx]).group(1))\n",
    "        if year<year_pre:\n",
    "            captureId[idx] = re.sub(r'^(.+)-\\d{4}(\\d{4})$', r'\\1-'+str(year_pre)+r'\\2', captureId[idx])\n",
    "        else:\n",
    "            year_pre = year\n",
    "    except:\n",
    "        pass\n",
    "# captureId\n",
    "\n",
    "library_df_filt.loc[:, \"文库名称\"] = captureId\n",
    "library_df_filt.loc[:, \"杂交日期\"] = [ re.sub(r'^.+-(\\d{4})(\\d{2})(\\d{2})$', r\"\\1/\\2/\\3\", x) if x!='-' else '-' for x in captureId]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "with pd.ExcelWriter(\"library.20200424.xlsx\") as writer:\n",
    "    library_df_filt.to_excel(writer, sheet_name=\"建库表(clinicalDf_overlap_libraryDf)\", index=False)\n",
    "    clinical_df_filt.to_excel(writer, sheet_name=\"临床信息表(clinicalDf_overlap_libraryDf)\", index=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "<bound method NDFrame.head of                  建库编号       DNA提取编号 管上编号    是否临床        文库名 样本标签 index列表  \\\n226   18S4146706_1_m1  18S4146706_1    1  Clinic      ssM-1    P   57,58   \n227   18S4146707_1_m1  18S4146707_1    2  Clinic      ssM-2    P   59,60   \n228   18S4146708_1_m1  18S4146708_1    3  Clinic      ssM-3    P   61,62   \n229   18S4146709_1_m1  18S4146709_1    4  Clinic      ssM-4    P   63,64   \n230   18S4146710_1_m1  18S4146710_1    5  Clinic      ssM-5    P   65,66   \n...               ...           ...  ...     ...        ...  ...     ...   \n2192  20L5383633_1_m1  20L5383633_1   12  Clinic  ssM-12-pl    T      68   \n2193  20S5383594_1_m1  20S5383594_1   13  Clinic  ssM-13-pl    T      69   \n2194  20L5383634_1_m1  20L5383634_1   14  Clinic  ssM-14-pl    T      70   \n2195  20S5383595_1_m1  20S5383595_1   15  Clinic  ssM-15-pl    T      71   \n2196  20L5383635_1_m1  20L5383635_1   16  Clinic  ssM-16-pl    T      72   \n\n            建库日期 建库方法    试剂批次  ...                               杂交探针 杂交时间h  \\\n226   2018/12/11    1       无  ...  SeqCap Epi CpGiant Enrichment Kit  66.5   \n227   2018/12/11    1       无  ...  SeqCap Epi CpGiant Enrichment Kit  66.5   \n228   2018/12/11    1       无  ...  SeqCap Epi CpGiant Enrichment Kit  66.5   \n229   2018/12/11    1       无  ...  SeqCap Epi CpGiant Enrichment Kit  66.5   \n230   2018/12/11    1       无  ...  SeqCap Epi CpGiant Enrichment Kit  67.5   \n...          ...  ...     ...  ...                                ...   ...   \n2192  2020/02/10    2  Lot009  ...  SeqCap Epi CpGiant Enrichment Kit  70.5   \n2193  2020/02/10    2  Lot009  ...  SeqCap Epi CpGiant Enrichment Kit  70.5   \n2194  2020/02/10    2  Lot009  ...  SeqCap Epi CpGiant Enrichment Kit  70.5   \n2195  2020/02/10    2  Lot009  ...  SeqCap Epi CpGiant Enrichment Kit  70.5   \n2196  2020/02/10    2  Lot009  ...  SeqCap Epi CpGiant Enrichment Kit  70.5   \n\n     PostPCR循环数 PostPCR浓度ng/ul 洗脱体积 文库质控                    文库名称     华大样本编号P  \\\n226        12.0           16.2   40    -  ssM-CpGiant-1-20181218  18P4146706   \n227        12.0           16.2   40    -  ssM-CpGiant-1-20181218  18P4146707   \n228        12.0           16.2   40    -  ssM-CpGiant-1-20181218  18P4146708   \n229        12.0           16.2   40    -  ssM-CpGiant-1-20181218  18P4146709   \n230        13.0           17.6   40    -  ssM-CpGiant-2-20181218  18P4146710   \n...         ...            ...  ...  ...                     ...         ...   \n2192        9.0           51.6   40    -  ssM-CpGiant-6-20200214  20P5383633   \n2193        9.0           51.6   40    -  ssM-CpGiant-6-20200214  20P5383594   \n2194        9.0           51.6   40    -  ssM-CpGiant-6-20200214  20P5383634   \n2195        9.0           51.6   40    -  ssM-CpGiant-6-20200214  20P5383595   \n2196        9.0           51.6   40    -  ssM-CpGiant-6-20200214  20P5383635   \n\n     捕获文库操作人        杂交日期  \n226      NaN  2018/12/18  \n227      NaN  2018/12/18  \n228      NaN  2018/12/18  \n229      NaN  2018/12/18  \n230      NaN  2018/12/18  \n...      ...         ...  \n2192     LZL  2020/02/14  \n2193     LZL  2020/02/14  \n2194     LZL  2020/02/14  \n2195     LZL  2020/02/14  \n2196     LZL  2020/02/14  \n\n[1105 rows x 34 columns]>"
     },
     "metadata": {},
     "execution_count": 10
    }
   ],
   "source": [
    "library_df_filt.head"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python36764bit590e0209625c41dab6b137cee9f052d1",
   "display_name": "Python 3.6.7 64-bit"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}